/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

	/* Basic ARM implementation.  This should run on anything except
	   for ARMv6-M, but there are better implementations for later
	   revisions of the architecture.  This version can support ARMv4T
	   ARM/Thumb interworking.  */
/* Parameters and result.  The code below reads its two string pointers
   from r0 and r1 and leaves its integer result in r0.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  r2, r3 and r12 are written without being saved.
   r4 and r5 are pushed on the stack by the code below before it writes
   to them, and are popped again before each return.  */
#define data1		r2	/* Current word (or byte) of string 1.  */
#define data2		r3	/* Current word (or byte) of string 2.  */
#define magic1		r4	/* Loaded with the constant 0x01010101.  */
#define tmp2		r5	/* Scratch; only the unaligned path uses it.  */
#define tmp1		r12
#define syndrome	r12	/* Overlaps tmp1 */

/* Function return sequence.  Any target newer than ARMv4, or any target
   with a Thumb instruction set (ARMv4T), returns with 'bx lr'; there the
   toolchain will transparently convert 'bx lr' to 'mov pc, lr' if that
   is needed.  Plain ARMv4 without Thumb has no 'bx' instruction, so it
   gets 'mov pc, lr' directly.  GCC has deprecated support for anything
   older than ARMv4T, but this keeps that corner case working in case
   anyone still needs it.  */
.macro  RETURN
#if __ARM_ARCH > 4 || __ARM_ARCH_ISA_THUMB != 0
	bx	lr
#else
	mov	pc, lr
#endif
.endm

	.arm
	/* strcmp: compare the NUL-terminated strings at src1 (r0) and
	   src2 (r1).  The value left in r0 is the difference between the
	   first pair of bytes that differ, each taken as an unsigned
	   byte, or 0 when both strings end at the same place.
	   r2, r3 and r12 are overwritten; r4 is saved on the stack here
	   and restored before returning.
	   def_fn, S2LO, S2HI, S2LOEQ, MSB and LSB are not defined in this
	   part of the file.  From their use here S2LO/S2HI/S2LOEQ are
	   presumably endian-dependent shift mnemonics and MSB/LSB
	   endian-dependent single-byte masks -- confirm against the
	   file that provides them.  */
def_fn strcmp
	.cfi_sections .debug_frame
	.cfi_startproc
	/* The low two bits of src1 ^ src2 are zero only when both
	   pointers have the same byte offset within a word.  */
	eor	tmp1, src1, src2
	tst	tmp1, #3
	/* Strings not at same byte offset from a word boundary.  */
	bne	.Lstrcmp_unaligned
	/* tmp1 = byte offset of src1 within its word.  The flags set by
	   this ANDS are still live at the LDREQ/BEQ below, because the
	   BIC and LDR instructions in between do not set flags.  Both
	   pointers are rounded down to a word boundary and the first
	   word of string 1 is fetched.  */
	ands	tmp1, src1, #3
	bic	src1, src1, #3
	bic	src2, src2, #3
	ldr	data1, [src1], #4
	/* Offset was zero: fetch the first word of string 2 as well and
	   skip the masking step.  */
	ldreq	data2, [src2], #4
	beq	1f
	/* Although s1 and s2 have identical initial alignment, they are
	   not currently word aligned.  Rather than comparing bytes,
	   make sure that any bytes fetched from before the addressed
	   bytes are forced to 0xff.  Then they will always compare
	   equal.  */
	/* tmp1 = (offset ^ 3) * 8 is used as a shift count applied to
	   ~MSB; the shifted value is then ORed into both words.
	   Presumably the result has 0xff in exactly the byte positions
	   that precede the start of the strings -- this depends on the
	   definitions of S2LO and MSB, which are not visible here.  */
	eor	tmp1, tmp1, #3
	mvn	data2, #MSB
	lsl	tmp1, tmp1, #3
	S2LO	tmp1, data2, tmp1
	ldr	data2, [src2], #4
	orr	data1, data1, tmp1
	orr	data2, data2, tmp1
1:
	/* Load the 'magic' constant 0x01010101.  magic1 is r4, so r4 is
	   pushed first and the CFI is updated to describe the push.  */
	str	r4, [sp, #-4]!
	.cfi_def_cfa_offset 4
	.cfi_offset 4, -4
	mov	magic1, #1
	orr	magic1, magic1, magic1, lsl #8
	orr	magic1, magic1, magic1, lsl #16
	.p2align	2
4:
	/* Word-at-a-time loop.  It keeps going while data1 == data2 and
	   (data1 - 0x01010101) & ~data1 & 0x80808080 is zero, i.e. while
	   the words match and data1 contains no zero byte.  Every
	   conditional instruction below depends on the flags of the
	   instruction before it, so the order must not change.  */
	sub	syndrome, data1, magic1
	cmp	data1, data2
	/* check for any zero bytes in first word */
	biceq	syndrome, syndrome, data1
	tsteq	syndrome, magic1, lsl #7
	ldreq	data1, [src1], #4
	ldreq	data2, [src2], #4
	beq	4b
2:
	/* There's a zero or a different byte in the word */
	/* Step through the word one byte per iteration.  result receives
	   one byte of data1.  'cmp result, #1' leaves carry clear and Z
	   clear when that byte is zero, so the CMPCS is skipped and the
	   loop ends on a terminator.  Otherwise the byte is compared with
	   the matching byte of data2 and the loop repeats while they are
	   equal.  */
	S2HI	result, data1, #24
	S2LO	data1, data1, #8
	cmp	result, #1
	cmpcs	result, data2, S2HI #24
	S2LOEQ	data2, data2, #8
	beq	2b
	/* On a big-endian machine, RESULT contains the desired byte in bits
	   0-7; on a little-endian machine they are in bits 24-31.  In
	   both cases the other bits in RESULT are all zero.  For DATA2 the
	   interesting byte is at the other end of the word, but the
	   other bits are not necessarily zero.  We need a signed result
	   representing the difference in the unsigned bytes, so for the
	   little-endian case we can't just shift the interesting bits
	   up.  */
#ifdef __ARM_BIG_ENDIAN
	sub	result, result, data2, lsr #24
#else
	and	data2, data2, #255
	rsb	result, data2, result, lsr #24
#endif
	/* Pop r4 and bring the CFI back to the state at function entry.  */
	ldr	r4, [sp], #4
	.cfi_restore 4
	.cfi_def_cfa_offset 0
	RETURN


#if 0
	/* The assembly code below is based on the following algorithm.
	   Everything up to the matching #endif is removed by the
	   preprocessor; it is kept only as a C-like description of the
	   path taken when the two strings have different offsets within
	   a word.  */
#ifdef __ARM_BIG_ENDIAN
#define RSHIFT <<
#define LSHIFT >>
#else
#define RSHIFT >>
#define LSHIFT <<
#endif

/* body(shift): word-at-a-time comparison loop.  It is invoked below
   with shift = 8, 16 or 24 for (s2 & 3) == 1, 2 or anything else.  It
   leaves the loop with tmp2 and data2 holding the bytes that the final
   do/while loop compares one at a time.
   NOTE(review): two of the exit paths use raw >> and << where the rest
   of the macro uses RSHIFT/LSHIFT -- confirm whether that is intended
   for the big-endian case.  */
#define body(shift)							\
  mask = 0xffffffffU RSHIFT shift;					\
  data1 = *src1++;							\
  data2 = *src2++;							\
  do									\
    {									\
      tmp2 = data1 & mask;						\
      if (__builtin_expect(tmp2 != data2 RSHIFT shift, 0))		\
	{								\
	  data2 RSHIFT= shift;						\
	  break;							\
	}								\
      if (__builtin_expect(((data1 - b1) & ~data1) & (b1 << 7), 0))	\
	{								\
	  /* See comment in assembler below re syndrome on big-endian */\
	  if ((((data1 - b1) & ~data1) & (b1 << 7)) & mask)		\
	    data2 RSHIFT= shift;					\
	  else								\
	    {								\
	      data2 = *src2;						\
	      tmp2 = data1 RSHIFT (32 - shift);				\
	      data2 = (data2 LSHIFT (32 - shift)) RSHIFT (32 - shift);	\
	    }								\
	  break;							\
	}								\
      data2 = *src2++;							\
      tmp2 ^= data1;							\
      if (__builtin_expect(tmp2 != data2 LSHIFT (32 - shift), 0))	\
	{								\
	  tmp2 = data1 >> (32 - shift);					\
	  data2 = (data2 << (32 - shift)) RSHIFT (32 - shift);		\
	  break;							\
	}								\
      data1 = *src1++;							\
    } while (1)

  const unsigned* src1;
  const unsigned* src2;
  unsigned data1, data2;
  unsigned mask;
  unsigned shift;
  /* b1 plays the role of the 0x01010101 constant used for the
     zero-byte test.  */
  unsigned b1 = 0x01010101;
  char c1, c2;
  unsigned tmp2;

  /* Compare single bytes until s1 reaches a word boundary.  */
  while (((unsigned) s1) & 3)
    {
      c1 = *s1++;
      c2 = *s2++;
      if (c1 == 0 || c1 != c2)
	return c1 - (int)c2;
    }
  /* s1 is now word aligned; s2 is rounded down to a word boundary and
     its offset within that word selects the shift passed to body().  */
  src1 = (unsigned*) (((unsigned)s1) & ~3);
  src2 = (unsigned*) (((unsigned)s2) & ~3);
  tmp2 = ((unsigned) s2) & 3;
  if (tmp2 == 1)
    {
      body(8);
    }
  else if (tmp2 == 2)
    {
      body(16);
    }
  else
    {
      body (24);
    }

  /* Final byte-by-byte comparison of what body() left in tmp2 and
     data2.  */
  do
    {
#ifdef __ARM_BIG_ENDIAN
      /* NOTE(review): in C the cast binds tighter than the shift, so
	 these two lines read as ((char) x) >> 24.  It has no effect
	 on the build because this region is never compiled.  */
      c1 = (char) tmp2 >> 24;
      c2 = (char) data2 >> 24;
#else /* not  __ARM_BIG_ENDIAN */
      c1 = (char) tmp2;
      c2 = (char) data2;
#endif /* not  __ARM_BIG_ENDIAN */
      tmp2 RSHIFT= 8;
      data2 RSHIFT= 8;
    } while (c1 != 0 && c1 == c2);
  return c1 - c2;
#endif /* 0 */


	/* Entry for strings whose pointers have different byte offsets
	   within a word.  First of all, compare bytes until src1 is
	   word-aligned.  No registers have been pushed at this point.  */
.Lstrcmp_unaligned:
	tst	src1, #3
	beq	2f
	ldrb	data1, [src1], #1
	ldrb	data2, [src2], #1
	/* 'cmp data1, #1' leaves carry clear and Z clear when data1 is
	   zero, so the CMPCS is skipped and the loop ends on a
	   terminator.  Otherwise the two bytes are compared and the loop
	   repeats while they are equal.  */
	cmp	data1, #1
	cmpcs	data1, data2
	beq	.Lstrcmp_unaligned
	/* Both bytes were loaded with LDRB, so this is the difference of
	   two zero-extended bytes.  */
	sub	result, data1, data2
	RETURN

2:
	/* src1 is word aligned from here on.  r4 and r5 hold magic1 and
	   tmp2, so both are pushed before they are written, and the CFI
	   records where they were stored.  */
	stmfd	sp!, {r4, r5}
	.cfi_def_cfa_offset 8
	.cfi_offset 4, -8
	.cfi_offset 5, -4
	/* magic1 = 0x01010101, built up from 1 with two shifted ORRs.  */
	mov	magic1, #1
	orr	magic1, magic1, magic1, lsl #8
	orr	magic1, magic1, magic1, lsl #16

	/* Fetch the first word of string 1, then the aligned word that
	   contains the current position of string 2.  tmp2 keeps the
	   byte offset of src2 inside that word.  The offsets differed on
	   entry and both pointers advanced by the same amount, so with
	   src1 aligned the offset is 1, 2 or 3.  */
	ldr	data1, [src1], #4
	and	tmp2, src2, #3
	bic	src2, src2, #3
	ldr	data2, [src2], #4
	/* Offset 2 goes to .Loverlap2, offset 3 to .Loverlap1, and
	   offset 1 falls through into .Loverlap3.  */
	cmp	tmp2, #2
	beq	.Loverlap2
	bhi	.Loverlap1

	/* Critical inner Loop: Block with 3 bytes initial overlap.
	   On entry src1 is word aligned, data1 holds the current word of
	   string 1, data2 holds the aligned word containing the current
	   position of string 2, and magic1 holds 0x01010101.  Three bytes
	   of data1 are paired with bytes of data2; the remaining byte is
	   paired with a byte from the following src2 word.
	   S2LO, S2HI, MSB and LSB are defined outside this part of the
	   file; presumably they choose shift direction and byte masks
	   according to endianness -- confirm there.  */
	.p2align	2
.Loverlap3:
	/* tmp2 = data1 with the MSB byte cleared, compared against data2
	   shifted by one byte.  The BNE below uses the flags from this
	   CMP; the SUB and BIC in between do not alter them.  */
	bic	tmp2, data1, #MSB
	cmp	tmp2, data2, S2LO #8
	/* syndrome = (data1 - 0x01010101) & ~data1.  Once masked with
	   0x80808080 below, it is non-zero when data1 has a zero byte.  */
	sub	syndrome, data1, magic1
	bic	syndrome, syndrome, data1
	bne	4f
	ands	syndrome, syndrome, magic1, lsl #7
	/* No zero byte: fetch the next src2 word.  If there is one, src2
	   is left untouched and control goes to 5.  */
	ldreq	data2, [src2], #4
	bne	5f
	/* tmp2 ^ data1 leaves only the byte of data1 selected by MSB; it
	   is compared with the new data2 shifted by three bytes.  */
	eor	tmp2, tmp2, data1
	cmp	tmp2, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	/* Mismatch within the three overlapping bytes.  data2 gets the
	   same shift that was used in the comparison, and the tail then
	   locates the differing byte in tmp2/data2.  */
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	/* data1 contains a zero byte and its three overlapping bytes
	   matched data2.  If the zero is among those three bytes the
	   strings are equal.  Otherwise the terminator is the remaining
	   byte of data1, and one more byte of string 2 is needed.  */
#ifdef __ARM_BIG_ENDIAN
	/* The syndrome value may contain false ones if the string ends
	   with the bytes 0x01 0x00, so the three bytes of data1 are
	   tested directly instead.  */
	tst	data1, #0xff000000
	tstne	data1, #0x00ff0000
	tstne	data1, #0x0000ff00
	beq	.Lstrcmp_done_equal
#else
	/* Ignore the marker for the top byte; any marker left over
	   belongs to one of the three bytes already compared.  */
	bics	syndrome, syndrome, #0xff000000
	bne	.Lstrcmp_done_equal
#endif
	/* src2 was not advanced above, so it addresses the word after
	   the one in data2.  Load a single byte from it and pair it with
	   the remaining byte of data1.  */
	ldrb	data2, [src2]
	S2LO	tmp2, data1, #24
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #24
#endif
	b	.Lstrcmp_tail

6:
	/* The remaining byte of data1 differs from the matching byte of
	   the next src2 word: keep just that byte on each side.  */
	S2LO	tmp2, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	/* Critical inner Loop: Block with 2 bytes initial overlap.
	   On entry src1 is word aligned, data1 holds the current word of
	   string 1, data2 holds the aligned word containing the current
	   position of string 2, and magic1 holds 0x01010101.  Two bytes of
	   data1 are paired with bytes of data2, and the other two with
	   bytes of the following src2 word.
	   S2LO, S2HI, MSB and LSB are defined outside this part of the
	   file; presumably they choose shift direction and byte masks
	   according to endianness -- confirm there.  */
	.p2align	2
.Loverlap2:
	/* tmp2 = data1 shifted 16 bits one way and 16 bits back, which
	   keeps two of its bytes and zeroes the other two.  The syndrome
	   computation is interleaved with it:
	   syndrome = (data1 - 0x01010101) & ~data1.  */
	S2HI	tmp2, data1, #16
	sub	syndrome, data1, magic1
	S2LO	tmp2, tmp2, #16
	bic	syndrome, syndrome, data1
	cmp	tmp2, data2, S2LO #16
	bne	4f
	/* Masked with 0x80808080, the syndrome is non-zero when data1
	   has a zero byte.  No zero byte: fetch the next src2 word.  If
	   there is one, src2 is left untouched and control goes to 5.  */
	ands	syndrome, syndrome, magic1, lsl #7
	ldreq	data2, [src2], #4
	bne	5f
	/* tmp2 ^ data1 leaves the two bytes of data1 that have not been
	   compared yet; they are checked against the new data2.  */
	eor	tmp2, tmp2, data1
	cmp	tmp2, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2

5:
	/* data1 contains a zero byte and its two overlapping bytes
	   matched data2.  If the zero is among those two bytes the
	   strings are equal.  Otherwise two more bytes of string 2 are
	   needed.  */
#ifdef __ARM_BIG_ENDIAN
	/* The syndrome value may contain false ones if the string ends
	   with the bytes 0x01 0x00, so the two bytes of data1 are tested
	   directly instead.  */
	tst	data1, #0xff000000
	tstne	data1, #0x00ff0000
	beq	.Lstrcmp_done_equal
#else
	/* Shift out the markers for the upper two bytes; anything left
	   belongs to one of the two bytes already compared.  */
	lsls	syndrome, syndrome, #16
	bne	.Lstrcmp_done_equal
#endif
	/* src2 was not advanced above, so it addresses the word after
	   the one in data2.  Load a halfword from it and pair it with the
	   two remaining bytes of data1.  */
	ldrh	data2, [src2]
	S2LO	tmp2, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	/* Mismatch in the second pair of bytes.  Together with the shift
	   at 4 below, data2 is shifted 16 bits one way and 16 bits back,
	   which keeps two of its bytes.  tmp2 receives the two remaining
	   bytes of data1.  Control falls through into 4.  */
	S2HI	data2, data2, #16
	S2LO	tmp2, data1, #16
4:
	/* Also reached directly on a mismatch in the first pair: data2
	   gets the same shift that was used in that comparison.  */
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail

	/* Critical inner Loop: Block with 1 byte initial overlap.
	   On entry src1 is word aligned, data1 holds the current word of
	   string 1, data2 holds the aligned word containing the current
	   position of string 2, and magic1 holds 0x01010101.  One byte of
	   data1 is paired with a byte of data2, and the other three with
	   bytes of the following src2 word.
	   S2LO, S2HI, MSB and LSB are defined outside this part of the
	   file; presumably they choose shift direction and byte masks
	   according to endianness -- confirm there.  */
	.p2align	2
.Loverlap1:
	/* tmp2 = the byte of data1 selected by LSB, compared against
	   data2 shifted by three bytes.  The BNE below uses the flags
	   from this CMP; the SUB and BIC in between do not alter them.  */
	and	tmp2, data1, #LSB
	cmp	tmp2, data2, S2LO #24
	/* syndrome = (data1 - 0x01010101) & ~data1.  Once masked with
	   0x80808080 below, it is non-zero when data1 has a zero byte.  */
	sub	syndrome, data1, magic1
	bic	syndrome, syndrome, data1
	bne	4f
	ands	syndrome, syndrome, magic1, lsl #7
	/* No zero byte: fetch the next src2 word.  If there is one, src2
	   is left untouched and control goes to 5.  */
	ldreq	data2, [src2], #4
	bne	5f
	/* tmp2 ^ data1 leaves the three bytes of data1 that have not been
	   compared yet; they are checked against the new data2.  */
	eor	tmp2, tmp2, data1
	cmp	tmp2, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	/* Mismatch in the single overlapping byte: data2 gets the same
	   shift that was used in the comparison.  */
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	/* The syndrome value may contain false ones if the string ends
	   with the bytes 0x01 0x00.  */
	/* So the LSB byte of data1 is tested directly.  It has already
	   matched data2, so if it is zero the strings are equal.
	   Otherwise the next src2 word, which was not loaded above, is
	   fetched and control falls through into 6.  */
	tst	data1, #LSB
	beq	.Lstrcmp_done_equal
	ldr	data2, [src2], #4
6:
	/* tmp2 = the three remaining bytes of data1, lined up with data2,
	   whose MSB byte is cleared so only three bytes take part.  */
	S2LO	tmp2, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail
	/* Exit used by the overlap loops when the strings turned out to
	   be equal: return 0.  r4 and r5 are still on the stack here.  */
.Lstrcmp_done_equal:
	mov	result, #0
	/* Save the CFI state that describes the frame with r4 and r5
	   pushed.  .Lstrcmp_tail, which is entered with the same frame,
	   restores it with .cfi_restore_state.  */
	.cfi_remember_state
	ldmfd	sp!, {r4, r5}
	.cfi_restore 4
	.cfi_restore 5
	.cfi_def_cfa_offset 0
	RETURN

	/* Common exit for the overlap loops.  On entry tmp2 holds bytes
	   of string 1 and data2 the matching bytes of string 2, lined up
	   so that the LSB mask selects the next pair to compare.  r4 and
	   r5 are still on the stack.  (LSB and S2LOEQ are defined outside
	   this part of the file.)  */
.Lstrcmp_tail:
	/* Re-establish the CFI state saved by .cfi_remember_state in
	   .Lstrcmp_done_equal, i.e. the frame with r4 and r5 pushed.  */
	.cfi_restore_state
	/* r2 is used under its raw name because data1, which is the same
	   register, is no longer needed.  r2 = next byte of string 1,
	   result = next byte of string 2.  */
	and	r2, tmp2, #LSB
	and	result, data2, #LSB
	/* 'cmp result, #1' leaves carry clear and Z clear when the
	   string 2 byte is zero, so the CMPCS is skipped and the loop
	   ends.  Otherwise the two bytes are compared; while they are
	   equal both words move on by one byte and the loop repeats.  */
	cmp	result, #1
	cmpcs	result, r2
	S2LOEQ	tmp2, tmp2, #8
	S2LOEQ	data2, data2, #8
	beq	.Lstrcmp_tail
	/* Return value = string 1 byte minus string 2 byte.  */
	sub	result, r2, result
	ldmfd	sp!, {r4, r5}
	.cfi_restore 4
	.cfi_restore 5
	.cfi_def_cfa_offset 0
	RETURN
	.cfi_endproc
	.size strcmp, . - strcmp
